{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Chapter - 5\n",
    "#  Applied Questions "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.linear_model import LogisticRegression,LinearRegression\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.preprocessing import PolynomialFeatures\n",
    "from sklearn.model_selection import cross_val_score\n",
    "\n",
    "import statsmodels.api as sm\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "# to ignore warnings\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\", category=FutureWarning)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. In Chapter 4, we used logistic regression to predict the probability of default using income and balance on the Default data set. We will now estimate the test error of this logistic regression model using the validation set approach. Do not forget to set a random seed before beginning your analysis.\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(10000, 4)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>default</th>\n",
       "      <th>student</th>\n",
       "      <th>balance</th>\n",
       "      <th>income</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>729.526495</td>\n",
       "      <td>44361.625074</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>No</td>\n",
       "      <td>Yes</td>\n",
       "      <td>817.180407</td>\n",
       "      <td>12106.134700</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>1073.549164</td>\n",
       "      <td>31767.138947</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>529.250605</td>\n",
       "      <td>35704.493935</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>785.655883</td>\n",
       "      <td>38463.495879</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  default student      balance        income\n",
       "1      No      No   729.526495  44361.625074\n",
       "2      No     Yes   817.180407  12106.134700\n",
       "3      No      No  1073.549164  31767.138947\n",
       "4      No      No   529.250605  35704.493935\n",
       "5      No      No   785.655883  38463.495879"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "default = pd.read_csv(r'E:\\programming\\dataset\\Into_to_statstical_learning\\Default.csv',index_col=0)\n",
    "print(default.shape)\n",
    "default.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "encoding_dict = {'Yes':1,'No':0}\n",
    "default['default'] = default['default'].map(encoding_dict)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### (a) Fit a logistic regression model that uses income and balance to predict default.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train accuracy is  0.9664\n"
     ]
    }
   ],
   "source": [
    "lr = LogisticRegression()\n",
    "lr.fit(default[['income','balance']],default['default'])\n",
    "pred = lr.predict(default[['income','balance']])\n",
    "print('Train accuracy is ',accuracy_score(pred,default['default']))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### (b) Using the validation set approach, estimate the test error of this model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Shape of X_train  (7000, 2)\n",
      "Shape of X_valid  (3000, 2)\n"
     ]
    }
   ],
   "source": [
    "# splitting the data into train and test\n",
    "X_train,X_valid,y_train,y_valid = train_test_split(default.drop(['default','student'],axis = 1),default['default'],test_size = 0.3,\n",
    "                                                  random_state = 0)\n",
    "print('Shape of X_train ',X_train.shape)\n",
    "print('Shape of X_valid ',X_valid.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
       "                   intercept_scaling=1, l1_ratio=None, max_iter=100,\n",
       "                   multi_class='warn', n_jobs=None, penalty='l2',\n",
       "                   random_state=None, solver='warn', tol=0.0001, verbose=0,\n",
       "                   warm_start=False)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# fit the logisitc regression on training data\n",
    "lr = LogisticRegression()\n",
    "lr.fit(X_train,y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# obtain the predictions\n",
    "pred = lr.predict(X_valid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Validation error is  0.038000000000000034\n"
     ]
    }
   ],
   "source": [
    "# computing the validation error\n",
    "print('Validation error is ', 1 - accuracy_score(pred,y_valid))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### (c) Repeat the process in (b) three times, using three different splits of the observations into a training set and a validation set. Comment on the results obtained.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Shape of X_train  (7000, 2)\n",
      "Shape of X_valid  (3000, 2)\n",
      "Validation error is  0.030333333333333323\n"
     ]
    }
   ],
   "source": [
    "# random_state = 1\n",
    "# splitting the data into train and test\n",
    "X_train,X_valid,y_train,y_valid = train_test_split(default.drop(['default','student'],axis = 1),default['default'],test_size = 0.3,\n",
    "                                                  random_state = 1)\n",
    "print('Shape of X_train ',X_train.shape)\n",
    "print('Shape of X_valid ',X_valid.shape)\n",
    "\n",
    "# fit the logisitc regression on training data\n",
    "lr = LogisticRegression()\n",
    "lr.fit(X_train,y_train)\n",
    "\n",
    "# obtain the predictions\n",
    "pred = lr.predict(X_valid)\n",
    "\n",
    "# computing the validation error\n",
    "print('Validation error is ', 1 - accuracy_score(pred,y_valid))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Shape of X_train  (7000, 2)\n",
      "Shape of X_valid  (3000, 2)\n",
      "Validation error is  0.03400000000000003\n"
     ]
    }
   ],
   "source": [
    "# random_state = 5\n",
    "# splitting the data into train and test\n",
    "X_train,X_valid,y_train,y_valid = train_test_split(default.drop(['default','student'],axis = 1),default['default'],test_size = 0.3,\n",
    "                                                  random_state = 5)\n",
    "print('Shape of X_train ',X_train.shape)\n",
    "print('Shape of X_valid ',X_valid.shape)\n",
    "\n",
    "# fit the logisitc regression on training data\n",
    "lr = LogisticRegression()\n",
    "lr.fit(X_train,y_train)\n",
    "\n",
    "# obtain the predictions\n",
    "pred = lr.predict(X_valid)\n",
    "\n",
    "# computing the validation error\n",
    "print('Validation error is ', 1 - accuracy_score(pred,y_valid))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Shape of X_train  (7000, 2)\n",
      "Shape of X_valid  (3000, 2)\n",
      "Validation error is  0.037666666666666626\n"
     ]
    }
   ],
   "source": [
    "# random_state = 10\n",
    "# splitting the data into train and test\n",
    "X_train,X_valid,y_train,y_valid = train_test_split(default.drop(['default','student'],axis = 1),default['default'],test_size = 0.3,\n",
    "                                                  random_state = 10)\n",
    "print('Shape of X_train ',X_train.shape)\n",
    "print('Shape of X_valid ',X_valid.shape)\n",
    "\n",
    "# fit the logisitc regression on training data\n",
    "lr = LogisticRegression()\n",
    "lr.fit(X_train,y_train)\n",
    "\n",
    "# obtain the predictions\n",
    "pred = lr.predict(X_valid)\n",
    "\n",
    "# computing the validation error\n",
    "print('Validation error is ', 1 - accuracy_score(pred,y_valid))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### (d) Now consider a logistic regression model that predicts the probability of default using income, balance, and a dummy variable for student. Estimate the test error for this model using the validation set approach. Comment on whether or not including a dummy variable for student leads to a reduction in the test error rate.\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "default['student'] = default['student'].map(encoding_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Shape of X_train  (7000, 3)\n",
      "Shape of X_valid  (3000, 3)\n",
      "Validation error is  0.030333333333333323\n"
     ]
    }
   ],
   "source": [
    "# splitting the data into train and test\n",
    "# we only drop default for the train data\n",
    "# NOTE - We are using random state = 1\n",
    "X_train,X_valid,y_train,y_valid = train_test_split(default.drop(['default'],axis = 1),default['default'],test_size = 0.3,\n",
    "                                                  random_state = 1)\n",
    "print('Shape of X_train ',X_train.shape)\n",
    "print('Shape of X_valid ',X_valid.shape)\n",
    "\n",
    "# fit the logisitc regression on training data\n",
    "lr = LogisticRegression()\n",
    "lr.fit(X_train,y_train)\n",
    "\n",
    "# obtain the predictions\n",
    "pred = lr.predict(X_valid)\n",
    "\n",
    "# computing the validation error\n",
    "print('Validation error is ', 1 - accuracy_score(pred,y_valid))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# we can see above that while using balance and income for predicting default, validation error was 30.0333. When we added \n",
    "# a new dummy variable student, the error remains the same ( random_state = 1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. We continue to consider the use of a logistic regression model to predict the probability of default using income and balance on the Default data set. In particular, we will now compute estimates for the standard errors of the income and balance logistic regression coefficients in two different ways: (1) using the bootstrap, and (2) using the standard formula for computing the standard errors in the glm() function. Do not forget to set a random seed before beginning your analysis.\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(10000, 4)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>default</th>\n",
       "      <th>student</th>\n",
       "      <th>balance</th>\n",
       "      <th>income</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>729.526495</td>\n",
       "      <td>44361.625074</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>817.180407</td>\n",
       "      <td>12106.134700</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1073.549164</td>\n",
       "      <td>31767.138947</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>529.250605</td>\n",
       "      <td>35704.493935</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>785.655883</td>\n",
       "      <td>38463.495879</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   default  student      balance        income\n",
       "1        0        0   729.526495  44361.625074\n",
       "2        0        1   817.180407  12106.134700\n",
       "3        0        0  1073.549164  31767.138947\n",
       "4        0        0   529.250605  35704.493935\n",
       "5        0        0   785.655883  38463.495879"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(default.shape)\n",
    "default.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### (a) Using the summary() and glm() functions, determine the estimated standard errors for the coefficients associated with income and balance in a multiple logistic regression model that uses both predictors.\n",
    "\n",
    "\n",
    " "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimization terminated successfully.\n",
      "         Current function value: 0.078948\n",
      "         Iterations 10\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:                default   No. Observations:                10000\n",
      "Model:                          Logit   Df Residuals:                     9997\n",
      "Method:                           MLE   Df Model:                            2\n",
      "Date:                Sun, 12 Jul 2020   Pseudo R-squ.:                  0.4594\n",
      "Time:                        16:48:04   Log-Likelihood:                -789.48\n",
      "converged:                       True   LL-Null:                       -1460.3\n",
      "Covariance Type:            nonrobust   LLR p-value:                4.541e-292\n",
      "==============================================================================\n",
      "                 coef    std err          z      P>|z|      [0.025      0.975]\n",
      "------------------------------------------------------------------------------\n",
      "const        -11.5405      0.435    -26.544      0.000     -12.393     -10.688\n",
      "balance        0.0056      0.000     24.835      0.000       0.005       0.006\n",
      "income      2.081e-05   4.99e-06      4.174      0.000     1.1e-05    3.06e-05\n",
      "==============================================================================\n",
      "\n",
      "Possibly complete quasi-separation: A fraction 0.14 of observations can be\n",
      "perfectly predicted. This might indicate that there is complete\n",
      "quasi-separation. In this case some parameters will not be identified.\n"
     ]
    }
   ],
   "source": [
    "X = default[['balance','income']]\n",
    "X = sm.add_constant(X)\n",
    "y = default['default']\n",
    "\n",
    "results = sm.Logit(y,X).fit()\n",
    "print(results.summary())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# std error for balance = 0, std error for incomes = 4.99e-06"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### (b) Write a function, boot.fn(), that takes as input the Default data set as well as an index of the observations, and that outputs the coefficient estimates for income and balance in the multiple logistic regression model.\n",
    "\n",
    " \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# to get random indices of given size\n",
    "# since the classes are imbalanced, we want the to random indeces to contain botht the classes, otherwise it will preocude an e\n",
    "# error, as model can't train if all observations belong to one class\n",
    "\n",
    "def get_indices(data,num_samples):\n",
    "    positive_data = data[data['default'] == 1]\n",
    "    negative_data = data[data['default'] == 0]\n",
    "    \n",
    "    positive_indices = np.random.choice(positive_data.index, int(num_samples / 4), replace=True)\n",
    "    negative_indices = np.random.choice(negative_data.index, int(3*num_samples / 4), replace=True)\n",
    "    total = np.concatenate([positive_indices,negative_indices])\n",
    "    np.random.shuffle(total)\n",
    "    return total"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# similar to boot.fn in exercise\n",
    "\n",
    "def boot_fn(data,index):\n",
    "    X = data[['balance','income']].loc[index]\n",
    "    y = data['default'].loc[index]\n",
    "    \n",
    "    lr = LogisticRegression()\n",
    "    lr.fit(X,y)\n",
    "    intercept = lr.intercept_\n",
    "    coef_balance = lr.coef_[0][0]\n",
    "    coef_income = lr.coef_[0][1]\n",
    "    return [intercept,coef_balance,coef_income]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Intercept is [-1.95810954e-06], the coeff of balance is 0.0013866377368020324 , the coeff for income is -8.907163088269097e-05 \n"
     ]
    }
   ],
   "source": [
    "# example - \n",
    "intercept,coef_balance,coef_income = boot_fn(default,get_indices(default,100))\n",
    "print('Intercept is {}, the coeff of balance is {} , the coeff for income is {} '.format(intercept,coef_balance,coef_income))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### (c) Use the boot() function together with your boot.fn() function to estimate the standard errors of the logistic regression coefficients for income and balance.\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "def boot(data,func,R):\n",
    "    intercept = []\n",
    "    coeff_balance = []\n",
    "    coeff_income = []\n",
    "    for i in range(R):\n",
    "        \n",
    "        [inter,balance,income] = func(data,get_indices(data,100))\n",
    "        intercept.append(float(inter))\n",
    "        coeff_balance.append(balance)\n",
    "        coeff_income.append(income)\n",
    "        \n",
    "    intercept_statistics = {'estimated_value':np.mean(intercept),'std_error':np.std(intercept)}   \n",
    "    balance_statistics = {'estimated_value':np.mean(coeff_balance),'std_error':np.std(coeff_balance)}\n",
    "    income_statistics = {'estimated_value':np.mean(coeff_income),'std_error':np.std(coeff_income)}\n",
    "    return {'intercept':intercept_statistics,'balance_statistices':balance_statistics,'income_statistics':income_statistics}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "results = boot(default,boot_fn,1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Balance -  {'estimated_value': 0.001567729988519676, 'std_error': 0.00036705765926319634}\n",
      "Income -  {'estimated_value': -8.184141421630866e-05, 'std_error': 1.5308635905371447e-05}\n"
     ]
    }
   ],
   "source": [
    "print('Balance - ',results['balance_statistices'])\n",
    "print('Income - ', results['income_statistics'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# we can see that the standard errors obtained from model and from boostrap are similar"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 7. In Sections 5.3.2 and 5.3.3, we saw that the cv.glm() function can be used in order to compute the LOOCV test error estimate. Alternatively, one could compute those quantities using just the glm() and predict.glm() functions, and a for loop. You will now take this approach in order to compute the LOOCV error for a simple logistic regression model on the Weekly data set.\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(1089, 9)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Year</th>\n",
       "      <th>Lag1</th>\n",
       "      <th>Lag2</th>\n",
       "      <th>Lag3</th>\n",
       "      <th>Lag4</th>\n",
       "      <th>Lag5</th>\n",
       "      <th>Volume</th>\n",
       "      <th>Today</th>\n",
       "      <th>Direction</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1990</td>\n",
       "      <td>0.816</td>\n",
       "      <td>1.572</td>\n",
       "      <td>-3.936</td>\n",
       "      <td>-0.229</td>\n",
       "      <td>-3.484</td>\n",
       "      <td>0.154976</td>\n",
       "      <td>-0.270</td>\n",
       "      <td>Down</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1990</td>\n",
       "      <td>-0.270</td>\n",
       "      <td>0.816</td>\n",
       "      <td>1.572</td>\n",
       "      <td>-3.936</td>\n",
       "      <td>-0.229</td>\n",
       "      <td>0.148574</td>\n",
       "      <td>-2.576</td>\n",
       "      <td>Down</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1990</td>\n",
       "      <td>-2.576</td>\n",
       "      <td>-0.270</td>\n",
       "      <td>0.816</td>\n",
       "      <td>1.572</td>\n",
       "      <td>-3.936</td>\n",
       "      <td>0.159837</td>\n",
       "      <td>3.514</td>\n",
       "      <td>Up</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1990</td>\n",
       "      <td>3.514</td>\n",
       "      <td>-2.576</td>\n",
       "      <td>-0.270</td>\n",
       "      <td>0.816</td>\n",
       "      <td>1.572</td>\n",
       "      <td>0.161630</td>\n",
       "      <td>0.712</td>\n",
       "      <td>Up</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1990</td>\n",
       "      <td>0.712</td>\n",
       "      <td>3.514</td>\n",
       "      <td>-2.576</td>\n",
       "      <td>-0.270</td>\n",
       "      <td>0.816</td>\n",
       "      <td>0.153728</td>\n",
       "      <td>1.178</td>\n",
       "      <td>Up</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Year   Lag1   Lag2   Lag3   Lag4   Lag5    Volume  Today Direction\n",
       "0  1990  0.816  1.572 -3.936 -0.229 -3.484  0.154976 -0.270      Down\n",
       "1  1990 -0.270  0.816  1.572 -3.936 -0.229  0.148574 -2.576      Down\n",
       "2  1990 -2.576 -0.270  0.816  1.572 -3.936  0.159837  3.514        Up\n",
       "3  1990  3.514 -2.576 -0.270  0.816  1.572  0.161630  0.712        Up\n",
       "4  1990  0.712  3.514 -2.576 -0.270  0.816  0.153728  1.178        Up"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "weekly = pd.read_csv(r'E:\\programming\\dataset\\Into_to_statstical_learning\\Weekly.csv')\n",
    "print(weekly.shape)\n",
    "weekly.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "weekly['Direction'] = weekly['Direction'].map({'Down':0,'Up':1})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### (a) Fit a logistic regression model that predicts Direction using Lag1 and Lag2.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
       "                   intercept_scaling=1, l1_ratio=None, max_iter=100,\n",
       "                   multi_class='warn', n_jobs=None, penalty='l2',\n",
       "                   random_state=None, solver='warn', tol=0.0001, verbose=0,\n",
       "                   warm_start=False)"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X = weekly[['Lag1','Lag2']]\n",
    "y = weekly['Direction']\n",
    "lr = LogisticRegression()\n",
    "lr.fit(X,y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### (b) Fit a logistic regression model that predicts Direction using Lag1 and Lag2 using all but the first observation.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
       "                   intercept_scaling=1, l1_ratio=None, max_iter=100,\n",
       "                   multi_class='warn', n_jobs=None, penalty='l2',\n",
       "                   random_state=None, solver='warn', tol=0.0001, verbose=0,\n",
       "                   warm_start=False)"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X = weekly[['Lag1','Lag2']].drop(0,axis = 0)\n",
    "y = weekly['Direction'].drop(0,axis = 0)\n",
    "\n",
    "lr = LogisticRegression()\n",
    "lr.fit(X,y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### (c) Use the model from (b) to predict the direction of the first observation. You can do this by predicting that the first observation will go up if P(Direction=\"Up\"|Lag1, Lag2) > 0.5. Was this observation correctly classified?\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Predicted - 1, true value - 0\n"
     ]
    }
   ],
   "source": [
    "X_test = weekly[['Lag1','Lag2']].loc[[0]]\n",
    "y_test = weekly['Direction'].loc[0]\n",
    "\n",
    "print('Predicted - {}, true value - {}'.format(int(lr.predict(X_test)),y_test))\n",
    "# it is wrongly classified"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### (d) Write a for loop from i = 1 to i = n, where n is the number of observations in the data set, that performs each of the following steps:\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "score = []\n",
    "for i in range(len(weekly)):\n",
    "    X = weekly[['Lag1','Lag2']].drop(i,axis = 0)\n",
    "    y = weekly['Direction'].drop(i,axis = 0)\n",
    "\n",
    "    lr = LogisticRegression()\n",
    "    lr.fit(X,y)\n",
    "    \n",
    "    X_test = weekly[['Lag1','Lag2']].loc[[i]]\n",
    "    y_test = weekly['Direction'].loc[i]\n",
    "    \n",
    "    pred = int(lr.predict(X_test))\n",
    "    score.append(int(pred == y_test))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### (e) Take the average of the n numbers obtained in (d)iv in order to obtain the LOOCV estimate for the test error. Comment on the results.\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy is  0.5500459136822773\n",
      "LOOCV error is  0.4499540863177227\n"
     ]
    }
   ],
   "source": [
    "print('Accuracy is ',np.mean(score))\n",
    "print('LOOCV error is ',1 - np.mean(score))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8. We will now perform cross-validation on a simulated data set."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### (a) Generate a simulated data set as follows:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.random.seed(1)\n",
    "y = np.random.normal(size = 100)\n",
    "X = np.random.normal(size = 100)\n",
    "y = X - 2*(X**2) + np.random.normal(size = 100) "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "N is 100 (Number of examples)\n",
    "p is 2 (number of features, ie X and X^2)\n",
    "\n",
    "Equation for model is - \n",
    "Y = X - 2*X^2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### (b) Create a scatterplot of X against Y . Comment on what you find"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Text(0.5, 1.0, 'Scatter plot')"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZAAAAEWCAYAAABIVsEJAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAgAElEQVR4nO3de5hcVZnv8e9b3emk02noJjcCCQYVo5GJQloQMqjIZZCDcsJlZBCCNxIGGc84jIMehpGR8TkEUI8cJ4aAHCDICCIoA8h1cDgGBDoCERJCuE6axKQTEuh0mu5013v+qAvV1VXV1burau/q+n2eJ0+q9t6196pKZb211rv2WubuiIiIjFQs7AKIiEh1UgAREZFAFEBERCQQBRAREQlEAURERAJRABERkUAUQETGKDNzM3t/2OWQsUsBRGqamf25mT1mZm+Z2ZtmtsrMPjbKc37RzH6Xte0GM/uX0ZW2PHKVV6QY9WEXQCQsZrYXcDfw18BtQANwFNAbZrlyMbN6d+8PuxwimdQCkVr2AQB3/zd3H3D3Hnd/wN3XpA4ws3PNbJ2ZdZnZWjM7NLn9W2b2csb2hcntHwKWA0eY2S4z22lmi4EvAP+Q3PbvyWP3M7Nfmlmnmb1qZl/PuO6lZna7md1sZm8DX8wufLJVs9zMHkyW4z/N7D253qiZ7W1mNyWv9bqZ/aOZxXKVtzQfrdQCBRCpZS8CA2Z2o5l9xsxaM3ea2enApcAiYC/gc8D25O6XSbRW9gb+GbjZzGa4+zrgPOBxd5/k7i3uvgL4GXBFcttnzSwG/DvwLLA/cAzwt2b2FxlFOBm4HWhJvj6XLwCXAVOAZwoc93+SZX0v8Mnke/pSrvIW/shE3qUAIjXL3d8G/hxw4Fqg08zuMrPpyUO+SqLSf8oTXnL315Ov/YW7b3L3uLvfCmwADhvB5T8GTHX377p7n7u/kizDGRnHPO7uv0peoyfPee5x90fdvRe4mERLYlbmAWZWB3we+La7d7n7a8D3gbNHUF6RIRRApKa5+zp3/6K7zwQOBvYD/ndy9ywSLY0hzGyRmT2T7KLamXztlBFc+j3AfqnXJ8/xP4HpGcdsLOI86WPcfRfwZvI9ZJpCIr/zesa210m0fEQCUxJdJMndXzCzG4AlyU0bgfdlH5fMM1xLotvpcXcfMLNnAEudKtfps55vBF5194MKFamIYqdbG2Y2CdgH2JR1zDZgD4mgtTa57QDgjRFcR2QItUCkZpnZB83sQjObmXw+C/gr4PfJQ64D/t7M5lvC+5PBo4lEpduZfN2XSLRAUrYAM82sIWvbezOePwm8bWYXmVmjmdWZ2cEBhhCfmByK3EAiF/KEuw9qubj7AIlRZt8zs+bke/g74OYC5RUZlgKI1LIu4HDgCTPrJhE4ngMuhESeA/gecEvy2F8B+7j7WhI5hMdJVL5/BqzKOO9/AM8DfzKzbcltPwXmJrurfpWs1D8LfBR4lUQr4ToSie6RuAX4Domuq/kkkuq5/A3QDbwC/C75uusLlFdkWKYFpUSqU7K7rcPd/zHsskhtUgtEREQCUQAREZFA1IUlIiKBqAUiIiKB1NR9IFOmTPHZs2eHXQwRkaqyevXqbe4+NXt7JANIcjz+TcC+QBxY4e4/yjrmU8CvSQyBBLjD3b9b6LyzZ8+mvb299AUWERnDzOz1XNsjGUCAfuBCd/+DmTUDq83sweT4+0z/z91PCqF8IiI1L5I5EHff7O5/SD7uAtaheXtERCIlkgEkk5nNBg4Bnsix+wgze9bMfmNmH87z+sVm1m5m7Z2dnWUsqYhIbYl0AElODvdL4G+TU29n+gPwHnf/CIm1Dn6V6xzuvsLd29y9berUITkgEREJKLIBxMzGkQgeP3P3O7L3u/vbyemrcfd7gXFmNpLptEVEZBQimUQ3MyMx+dw6d/9BnmP2Bba4u5vZYSSC4fZcx4pEUTzubO/uo69/gIb6OiY3NRCL2fAvFImISAYQYAGJ1dL+mFxnARKL7RwA4O7LgdOAvzazfqAHOMN1W71UiXjcWb+li3NvaqdjRw8zWxu5dlEbc6Y3FwwiCjoSJTU1lUlbW5vrPhApt2Iq+c6uXhYuW0XHjndXqp3Z2sid5y9gavP4vOcNEnRKVWapXWa22t3bsrdHtQUiUpWKreT7+gcGBQ+Ajh099PUP5D339u6+9HlTx597U3vBoDNcWVNBYyDu/Ms9a3lg7daSBiYZ2yKbRBepRvkq+e3dfYOOa6ivY2Zr46BtM1sbaaivy3vuIEEnn1SgW7hsFQuWPsKZ1z3BOUceyCGzWvKWWSSbAohICQ1XycfjTmdXL339A9zy1cM5fu40gPSv/slN+VeVDRJ08skV6C765RrO+9T7hpRZJB91YYmUUKqSz85tNNTX5ezeuubs+Vx28sHEYrFh8w6Tmxq4dlHbkO6xQkEnn3yBrqVx3KAyixSiFohICaUq+VRLIbOSz/Wrf8nK1cRiMaY2jx823xCLGXOmN3Pn+QtYddHR3Hn+gsB5inytmZ09e0YVmKS2qAUiUkKZlXz2iKZS5DBiMQuUMM+WqzVzzdnzmdLUwJ3nL9AoLCmKAohIieWr5At1b1VaoUAnUix1YYlUSKHurTCkAt3+rROL6kITyaYWiEiF6Fe/jDUKICIVVKochkgUKICIlEilpgPRtCMSFQogIkUYrtKu1OSII7lOWIFGAa52aDJFkWHkvAHwrPnMaJlAS2Oiciz15IhAzkq42OuUc+LFQsK6rpRXvskUNQpLZBg5bwC8eTXPbnyL9Vu6iMd91JMjHjKrhUtOmkt3bz9b3n6H17Z3p+epWrhs1YivU+ycXKUW1nUlHAogIsPIV2lPbKhLV47FzFOVmgfrjR270/NhpYLH3//FHC67ey2nLX+c0695nC1vv8PUSePT1xrJdQqVudzzW5XjutmfWzxeO70mUacAIjKMQtN+dOzooWfPAK2N4wre45E9++3CZasYiDszWxs571Pv46Jfrhn0q/2bt787sWFqW1//QNH3kpRy4sXhZFbwZlbS6+b63FKtMQmfciAiw8jVr7/01Hlcdf96Onf1ctnJB7Pv3hM4aOokdvTsoa9/gHH1MepjRk9fIodRF4PP/Xhw7mLJUbM58+OzGYg7G7buYvlvX+bpjTvT+29d/HE+v+L3wOA8RzFJ6nwTN86Z1kx9fel+N2Zf5/i50/j6MR/gvJtXlyQHEiS3JKWnBaVEAkrdAHjH+Ueyu3eAV7d1p4NHZiDJrOBzJd2nThqfrggPmdXCJ+ZM5wvXPTEkKD29cSczWxvZ3Zfo9klVwq2N49JdXw31dczYuzFvpRyLGQdNncQtXz2crV29bO/u40cPvcg3jptT0oR2ds7jgbVbAbhtyRG4+6hHYYXVFSfFUQARKUIsZkxrnkC8yRlXZ3zrMx9kZ8+edIUPpCu1fEn3y04+mC/d8BRAzm6ri365hktOmstld6/l2kVtTN9rPKsuOpqG+jpaG8exoXMXP3xwPafOn8XkpgZ6+vrZb+/GvC2KHT17ODMZoFLWbu7K+es96NDbXBX8A2u38p3POvu3Tizmoy0oSvOHyVAKICIjEIsZDfV1XPiLZ/NWavl+NR84pSldGU5uash5zIf2bR48G25TYl9nVy8/fHA95xx5YDrwpLqlPrTvXjm7sHr29Bf16300Q2/LXcGXcg0UKT0l0UVGaLhEdr4E9sTxdem1PPZracx5TGNDfc6JDfv6Bzh1/qwhrZYlK1cPGSKbCggvb+0eco0lR80G4PXt3Wza2UN/f3xUQ2/LPUFkKddAkdKLbAvEzE4AfgTUAde5++VZ+8cDNwHzge3A5939tUqXU2pPrkkRWxvHpbuAGhvqBv1qPn7uNP7xv81lT388nbsARvTLOtWtNFyLIh53/vT2O3T39jOuzvjxmYdwwS1PJ4LNUbM56aMz+fyK36evufys+bRMHBc4z1CqCSILdaFp/rDoimQAMbM64F+B44AO4Ckzu8vd12Yc9hVgh7u/38zOAJYCn698aaUWZVZqubqAbvryYfz6giPp73c6d/WlcxGZ3UMjqXhTOY9Cy+Xu7Olj8853WJIxAur7p3+EK0+bRyw5vDYVPCARJK5++EW+89kPj6obarQVvO5er15R7cI6DHjJ3V9x9z7g58DJWcecDNyYfHw7cIyZ6dsmFZerC+jy36zjT2/1snZzV3pIa2pfqntoJOtxxGLGfnsnch7Z3UWtjeNYv6WLZze+lQ4eqWtd+Itnefudfi78xbP0x31QkDhkVgvnHHkg//zvz7P01HmhrVOiu9erVyRbIMD+wMaM5x3A4fmOcfd+M3sLmAxsyzzIzBYDiwEOOOCAcpVXaliupPmp82exZOVqvn/6R0o2DLW+PsaH9t1rSKslVQHnu1YqTzGuLjaopZE5Eqyzq49LTprL5KYG9mtpZN+9JlTs17+G6lavqLZAcn1zs+94LOYY3H2Fu7e5e9vUqVNLUjiRTLmS5ql8xc6ePSW9MztXqyUej3PJSXOZ1jw+57X2a2lkzvRmpk0az/Kz3m3BZOZUnt64kyUrV3Pa8sdJ3VxcqelDKnnXvJRWVANIBzAr4/lMYFO+Y8ysHtgbeLMipRPJkGskUqoyX/7bl8vaPRSPO9u6+7js7rX83W3PcuVpQ6+Vak3U18f44PRmbltyBI9+81PM2HtCnpFgdRWdPqTYkVyaEyt6IjmVSTIgvAgcA7wBPAWc6e7PZxzzNeDP3P28ZBL9FHf/y0Ln1VQmUi7Zo4haJtSzfusuzrt5NVMnjefrxxzE7ClNNDXUMWVS6dYfz57q45BZLXz9mIN479QmGupixAxisVjR051cu6iNyZMaOGXZY0OS6necfyTTmieUpNzZyrXeipRGVU1lksxpXADcT2IY7/Xu/ryZfRdod/e7gJ8CK83sJRItjzPCK7HUuuyRSJ1dvVz98ItcctJcWhrHsbNnD//r3rV8b+G8klZ42fmDpzfu5Es3PMV/fvNTnH7N4wUr2+whuKn5u7reyX0D4u7eAeJNnuw2C75oVL7XFhrJlS/RrjmxwhXJAALg7vcC92Zt+6eMx+8Ap1e6XFIdwl4Vr69/gAfWbk3PDZXync+WNjGc707wVzq7i6psUxV35i/8S06am/Ocr27rpml8PZObGnK2BjInk8z+zFP/HvF4nG3dfSxZObLJFpVoj6ao5kBEAgtjCvDs/vlx9bGKJIZz5Q+uOWs+Vz+8YdBxI1ncavlvX+YnXzh00DmXnjqPqx/eQF//QM7WwA8fXM/6rbk/88x/j2c63koHj9Rrixmyq0R7NEW2BSISVLm6O/r742zd1cuegTjj6mJMmzSe+vpY3hsJi7nTfLQtpVx3gtfFoHNX76DjhqtsM3/hP71xJ7t6+7ns5IOZ2FCXnjSyc1cvDfV1BYct5/rMgfTn0NIY7K53zYkVTQogMuYU090x0oq7vz/OC1u6Bq1zsfys+XxwejM7evYMCViLrn+Suy5YUPBO81IlhrPzB/G4D1vZZr//VIsp9R6uuG89/3DCnPSkkZnn2N7dN6SLa7hpVlL7UsOaR3rXe6mmTJHSUgCRMWe4GWKDVNxbd/UOuaP8vJtXp9e9yFV59vQNFJzSfLQtpXxBcLjKtpgWU+euXqbvNYE7zj8yPYdX6hy5WgP7NDUU/MxT+1LDmjNnFC7Ukgg7lyWFKYDImDNcd0eQinvPQDxnkOgfiNPYUB/oV/VoEsPDBcFCo5pyvf9iWkwp2QEK4EcPbRgSGK45a376M0/9ezy9cSc3PvYqt3z1cOqSU+Pnu46G7kafAoiMOZkVXDweZ8BhIB5n81s9NDbk7sMfruLOngYEEkFiwrg6HOfmrxzOq9u6ufrhDXTu6s25gmB2RTmatTTyBcFiVgLM9/6HazFlygxQnV29PPbKdjZs3ZUetry7b4AZLe9OhxKk+ylfsv7Szx1cktUOZfQUQGRMSnW1ZP+CvfK0ecyePDFnxW1mvLFjd86KKTUNSGYO5IYvfYztu/o4d+XgpWtntExgr/GJFQQL/XoeTWI4XxDYtLOH05Y/XvDXena+I/X+g45oynwfS1auTl+7pfHd9xFkxt7s95ia/PEvh7m/RSonkneil4vuRK8t2XdpQ6KivOr0j9A8oX7QvQjLz5rP1Q+/yANrt+atmFKjsPoH4tTXJW66O+UnQ+/YTo08ynXt7G6yoH38+d7bJSfNZcnK1QWv99r2bra8/Q7fvH1NySricuQqst/jNWfP57K71w77mUrpVdWd6CKlkO9XugFTmhrSXSpmxqV3PZe+6S9fTqS+PsZ+Le/ei/DGjt1FjTzKtS8l6FoauVovS0+dx1X3ry94ve3dfSy6/kmmTho/qLtp+l6jm16lHIs+Zb/HYhbUkspSAJGqVMwv3nw5ht19A8RisXSF98aO3UPuGC9UMaWunTrfcCOPcu0brVSe59bFH6djRw/7NDVw5f0v8PTGnQWvlwqqHTt60i0VgFUXHZ1ef71Y5R4hlZ2st+SiWOX6TGXkdCe6VJ1i7zTPdZf2lafN4z2TJw7KM4zkLufMa19wy9M5Z7+d3NRQ9rXCIVHBNtTXceEvnuUfbl/DOUceOOz1Rvpe881+W6m7/TOnr993rwll/0xlZJQDkaqTr/8/V194YrrzXt7ZE6fOoLGhjpbG4Df05Zv99n3TJtE4Lvf8T+VaKzy77KlZfw+c0sTE8XVMaRraLVXsex3uuJH8G5SS7gsJR74ciAKIVJ03duxmwdJHhmxfddHRRQ9DzVZsxVSOaxcqU7GV/Ugq1WKOzxcgUsOEB9z5xBW/HXLu0X4OChDRlC+AqAtLqk45JtYrdn3ySk7qV+xa4SNZW73Y4wsNE16w9BFe3tpd8s8hjEkwZXQUQKTqVCK/EIVrhzmFeb5AmQpeVz+8IW/+J6hiA6ZEh0ZhSdUJc2K9Sl57NHeqj9Zww4Sf3riTK+5bz62LP54u62g/B635UX0UQKQqleO+g6hdO8wpzHMNob30rucGDRPu3JVY92S0y9ym8h4D7vzfL36Mqx/ekL6OhulGmwKISESFPYV5ZqB8s7uXLy04kLWbu9LBbNkXDmUgHqezqzdwuXINFLjytHlccd/69JxiGqYbXQogIhEWZksrU0/fAFfct55LTprLtObx7N04jst/s67g1C/FyJX3+Obta7h18cc1CqsKKIkuIsNqqK+jc1cvS1auZmtXL4uuf3LI1C9Bkt358h5AUSPKJFwKICIyrMzRZ0GXpc1Fa51Xt8gFEDO70sxeMLM1ZnanmbXkOe41M/ujmT1jZro7UKSMMvMxM1sbS1bphzkkW0Yvcneim9nxwH+4e7+ZLQVw94tyHPca0Obu24o9t+5EFxm9Uq8UqLvPo69qpnN39wcynv4eOC2sskh1U8VUHqUeHRaVgQIycpELIFm+DNyaZ58DD5iZA9e4+4pcB5nZYmAxwAEHHFCWQkr0aD3t8lKlLxBSDsTMHjKz53L8OTnjmIuBfuBneU6zwN0PBT4DfM3MPpHrIHdf4e5t7t42derUkr8XiSZNiyFSfqG0QNz92EL7zewc4CTgGM+TpHH3Tcm/t5rZncBhwKOlLquEL0hXlKbFECm/yHVhmdkJwEXAJ919d55jmoCYu3clHx8PfLeCxZQKCdoVFeY8UiK1InLDeIEfA83Ag8khussBzGw/M7s3ecx04Hdm9izwJHCPu98XTnGlnIJ2RWl4qEj5Ra4F4u7vz7N9E3Bi8vErwEcqWS4JR9CuqLDnkRKpBZELICKZRtMVpZFCIuUVxS4skTR1RYlEl1ogEmnqihKJLgUQiTx1RYlEkwKIlISmDRGpPQogMmqlmjZEQUhGQt+X8CmAyKjlu1fjzvMXFN31pLmrZCT0fYkGjcKSUSvFtCGau0pGQt+XaFALREYsu+tgXH1s1NOGaO6q2lGKrid9X6JBLRAZkVTXwcJlq1iw9BEWLlvFrnf6R32vhpY2rQ25vj/rt3QRj49sYTt9X6IhcisSlpNWJBy9zq5eFi5bNaS1cdcFCxiIE/hXpfq0a0O+789I8mWg70ulVc2KhBJt+boOevoG2L91YuDz6obB2lCqrid9X6JBAURGJMjcVMX2eeuGwbGvlNPsl+r7ouHAwSkHIiMy0rmpStXnLWND1OY20/dzdJQDkREbyS+2rV3vcMqyx0bd5y1jR5R+8ZcqJzPWKQciJVNs10E87uzuHb7PO0oVipRflLoqNRx4dBRApGy2d/fx6rbugn3eGk0jYdLSx6OjHIiUTV//AFc/vIGlp84b1Od9zVnz033euqNYwhS1nEy1UQtESiqzO8rM6NzVy1X3r+eSk+bS0jiO3X0DzGiZkG5dqAtBwqThwKOjACIlk90ddfzcaSw/az7n3byaJStXJ1ofZ89nT3+czq5eJjc1qAtBQleKnEyt5vEUQKRksrujHli7FYDblhyBuzMQd/7lnrU8sHZruqvgoKmTuHZR25AciLoQpFrUch4vcjkQM7vUzN4ws2eSf07Mc9wJZrbezF4ys29VupwyVK7uqAfWbsXdaaiv48zrnkgHlVSuY0fPnnQXwqqLjubO8xfUxH88GTtqOY8X1RbID939qnw7zawO+FfgOKADeMrM7nL3tZUqoAxVqDuqUK4jSsM6RUaqlvN4kWuBFOkw4CV3f8Xd+4CfAyeHXKaaV2hEi2ZPlbGqlr/bUQ0gF5jZGjO73sxac+zfH9iY8bwjuW0IM1tsZu1m1t7Z2VmOsta0eNzp7OrljR272d7dx0FTJ+XsjtJwSRmravm7HcpUJmb2ELBvjl0XA78HtgEOXAbMcPcvZ73+dOAv3P2ryednA4e5+98Uuq6mMimtkSYPa3Wkiox9Y/27HampTNz92GKOM7Nrgbtz7OoAZmU8nwlsKkHRZARGuha6ch0yVtXqUODIJdHNbIa7b04+XQg8l+Owp4CDzOxA4A3gDODMChWxJuX6ctdy8lCklKp1KHAUcyBXmNkfzWwNcDTwDQAz28/M7gVw937gAuB+YB1wm7s/H1aBx7p8U16n1kLPVCvJQ5FSqtahwJELIO5+trv/mbvPc/fPpVoj7r7J3U/MOO5ed/+Au7/P3b8XXonHvnxf7vqY1WzyUKSQzMElnV29w64vUq2t+ch1YUn0FFrGVvMIiQwWpDuqWqf0iVwLRKKn0Dj3VPJw/9aJTG0er+AhNS9Id1S1DgVWC0SGlfpya74qkeEF6Y6q1lmBFUBkWNX65RYJQ9DuqGoc5q4uLCmKuqpEilOt3VFBqAUiIlJCtdRiVwARESmxauyOCiJvF5aZ3WtmsytXFBERqSaFciA3AA+Y2cVmNq5C5RERkSqRtwvL3W8zs3uAfwLazWwlEM/Y/4MKlE9ERCJquBzIHqAbGA80kxFARESktuUNIGZ2AvAD4C7gUHffXbFSiYhI5BVqgVwMnK5ZbkVEJJdCOZCjKlkQERGpLroTXUREAlEAERGRQHQnuohIBGhNdImsavxyitQKrYkukZVrTfN1f3qb/n7d1iMSBVoTXSIr15dzycrVbHqrZ9i1mkWk/Kp1TXQFkBqQ78u5tas38r9wRGpBoWWjoyxyAcTMbjWzZ5J/XjOzZ/Ic95qZ/TF5XHuly1lN8n05UzkREQlXtS5CFbkkurt/PvXYzL4PvFXg8KPdfVv5S1XdJjc1cM3Z81mycnU6Qbf01Hnc+NirHHrAvLCLJ1LzqnURqsgFkBQzM+AvgU+HXZZqF4sZc6Y1c8tXD093W9342Kt847g5kf+FI1IrqnERqsgGEOAoYIu7b8iz30msV+LANe6+ItdBZrYYWAxwwAEHlKWg1aC+PsbM1ok0NtQzY+8JHHrAvKr4hSMi0RVKADGzh4B9c+y62N1/nXz8V8C/FTjNAnffZGbTgAfN7AV3fzT7oGRgWQHQ1tZW00OOqvEXjohEVygBxN2PLbTfzOqBU4D5Bc6xKfn3VjO7EzgMGBJARESkPCI3CivpWOAFd+/ItdPMmsysOfUYOB54roLlExGpeVENIGeQ1X1lZvuZ2b3Jp9OB35nZs8CTwD3ufl+FyygiUtMimUR39y/m2LYJODH5+BXgIxUuloiIZIhqC0RERCJOAURERAJRABERkUAUQEREJBAFEBERCUQBREREAlEAERGRQBRAREQkEAUQEREJRAFEREQCUQAREZFAFEBERCQQBRAREQlEAURERAJRABERkUAiuR6IiIiMXjzubO/uo69/gIb6OiY3NRCLWcnOrwAiIjIGxePO+i1dnHtTOx07epjZ2si1i9qYM725ZEFEXVgiImPQ9u6+dPAA6NjRw7k3tbO9u69k11AAEREZg/r6B9LBI6VjRw99/QMlu4YCSJWIx53Orl7e2LGbzq5e4nEPu0giEmEN9XXMbG0ctG1mayMN9XUlu4YCSBVI9WUuXLaKBUsfYeGyVazb/DZvdiuQiEhuk5sauHZRWzqIpHIgk5saSnYNcw+nAjKz04FLgQ8Bh7l7e8a+bwNfAQaAr7v7/TlefyDwc2Af4A/A2e5esHOvra3N29vbCx0SSZ1dvSxctmpQc3RmayOXnXww++49oaRJMREZO0o1CsvMVrt7W/b2MFsgzwGnAI9mbjSzucAZwIeBE4BlZparzbUU+KG7HwTsIBFwxqR8fZkTG+pKnhQTkbEjFjOmNo9n/9aJTG0eX/IfmqEFEHdf5+7rc+w6Gfi5u/e6+6vAS8BhmQeYmQGfBm5PbroR+O/lLG+Y8vVl7uzZU/KkmIhIsaKYA9kf2JjxvCO5LdNkYKe79xc4BgAzW2xm7WbW3tnZWfLCVkKuvsylp85j+W9fLnlSTESkWGW9kdDMHgL2zbHrYnf/db6X5diWnagp5pjERvcVwApI5EDyXDPSYjFjzvRm7jj/SHb3DvDqtm6uun89nbt6S54UExEpVlkDiLsfG+BlHcCsjOczgU1Zx2wDWsysPtkKyXXMmBKLGdOaJxBvcprG1/PjMw8py9QEIiLFimIX1l3AGWY2PjnS6iDgycwDPDF07BHgtOSmc4B8LZoxpdxJMRGRYoUWQMxsoZl1AEcA95jZ/QDu/jxwG7AWuA/4mrsPJF9zr5ntlzzFRcDfmSODjj4AAAdTSURBVNlLJHIiP630exARqWWh3QcShmq9D0REJExRvA9ERESqmAKIiIgEogAiIiKBKICIiEggCiAiIhKIAoiIiASiACIiIoEogIiISCAKICIiEogCiIiIBKIAIiIigSiAiIhIIAogIiISiAKIiIgEogAiIiKBKICIiEggCiAiIhKIAoiIiASiACIiIoEogIiISCChBBAzO93MnjezuJm1ZWw/zsxWm9kfk39/Os/rLzWzN8zsmeSfEytXehERAagP6brPAacA12Rt3wZ81t03mdnBwP3A/nnO8UN3v6qMZRQRkQJCCSDuvg7AzLK3P53x9HlggpmNd/feChZPRESKEOUcyKnA0wWCxwVmtsbMrjez1koWTEREyhhAzOwhM3sux5+Ti3jth4GlwJI8h/wEeB/wUWAz8P0C51psZu1m1t7Z2RngnYiISC5l68Jy92ODvM7MZgJ3Aovc/eU8596Scfy1wN0FyrECWAHQ1tbmQcokIiJDRaoLy8xagHuAb7v7qgLHzch4upBEUl5ERCoorGG8C82sAzgCuMfM7k/uugB4P3BJxhDdacnXXJcx5PeK5FDfNcDRwDcq/R5ERGqduddOr05bW5u3t7eHXQwRkapiZqvdvS17e6S6sEREpHoogIiISCAKICIiEogCiIiIBKIAIiIigSiAiIhIIAogIiISiAKIiIgEogAiIiKBKICIiEggCiAiIhKIAoiIiASiACIiIoEogIiISCAKICIiEogCiIiIBFK2NdHHknjc2d7dR1//AA31dUxuaiAWs7CLJSISKgWQYcTjzvotXZx7UzsdO3qY2drItYvamDO9WUFERGqaurCGsb27Lx08ADp29HDuTe1s7+4LuWQiIuFSABlGX/9AOnikdOzooa9/IKQSiYhEgwLIMBrq65jZ2jho28zWRhrq60IqkYhINIQSQMzsdDN73sziZtaWsX22mfWY2TPJP8vzvH4fM3vQzDYk/24tV1knNzVw7aK2dBBJ5UAmNzWU65IiIlUhrCT6c8ApwDU59r3s7h8d5vXfAh5298vN7FvJ5xeVuIwAxGLGnOnN3Hn+Ao3CEhHJEEoAcfd1AGaBK+GTgU8lH98I/JYyBRBIBJGpzePLdXoRkaoUxRzIgWb2tJn9p5kdleeY6e6+GSD597R8JzOzxWbWbmbtnZ2d5SiviEhNKlsLxMweAvbNsetid/91npdtBg5w9+1mNh/4lZl92N3fDloOd18BrABoa2vzoOcREZHByhZA3P3YAK/pBXqTj1eb2cvAB4D2rEO3mNkMd99sZjOAraMusIiIjEikurDMbKqZ1SUfvxc4CHglx6F3AeckH58D5GvRiIhImYQ1jHehmXUARwD3mNn9yV2fANaY2bPA7cB57v5m8jXXZQz5vRw4zsw2AMcln4uISAWZe+2kBcysE3g97HIUYQqwLexCVJjec+2oxfdd7e/5Pe4+NXtjTQWQamFm7e7eNvyRY4fec+2oxfc9Vt9zpHIgIiJSPRRAREQkEAWQaFoRdgFCoPdcO2rxfY/J96wciIiIBKIWiIiIBKIAIiIigSiARJSZXWlmL5jZGjO708xawi5TueVbJ2YsMrMTzGy9mb2UXJJgzDOz681sq5k9F3ZZKsHMZpnZI2a2Lvm9/h9hl6nUFECi60HgYHefB7wIfDvk8lRCap2YR8MuSDklp+v5V+AzwFzgr8xsbrilqogbgBPCLkQF9QMXuvuHgI8DXxtr/84KIBHl7g+4e3/y6e+BmWGWpxLcfZ27rw+7HBVwGPCSu7/i7n3Az0mscTOmufujwJthl6NS3H2zu/8h+bgLWAfsH26pSksBpDp8GfhN2IWQktkf2JjxvIMxVrHIYGY2GzgEeCLckpRWWEvaCsWtmWJmF5NoCv+skmUrl4DrxIw1uZbi1Hj6McrMJgG/BP52NGsbRZECSIiGWzPFzM4BTgKO8TFyw06QdWLGoA5gVsbzmcCmkMoiZWRm40gEj5+5+x1hl6fU1IUVUWZ2Aol13j/n7rvDLo+U1FPAQWZ2oJk1AGeQWONGxhAzM+CnwDp3/0HY5SkHBZDo+jHQDDxoZs+Y2fKwC1RuBdaJGVOSgyMuAO4nkVi9zd2fD7dU5Wdm/wY8Dswxsw4z+0rYZSqzBcDZwKeT/4efMbMTwy5UKWkqExERCUQtEBERCUQBREREAlEAERGRQBRAREQkEAUQEREJRAFEJCTJ2VpfNbN9ks9bk8/fE3bZRIqhACISEnffCPwEuDy56XJghbu/Hl6pRIqn+0BEQpSc6mI1cD1wLnBIcoZekcjTXFgiIXL3PWb2TeA+4HgFD6km6sISCd9ngM3AwWEXRGQkFEBEQmRmHwWOI7Fi3TfMbEbIRRIpmgKISEiSs7X+hMQ6Ef8FXAlcFW6pRIqnACISnnOB/3L3B5PPlwEfNLNPhlgmkaJpFJaIiASiFoiIiASiACIiIoEogIiISCAKICIiEogCiIiIBKIAIiIigSiAiIhIIP8fnqHCQ/8tDi0AAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "sns.scatterplot(X,y)\n",
    "plt.xlabel('X')\n",
    "plt.ylabel('Y')\n",
    "plt.title('Scatter plot')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### (c) Set a random seed, and then compute the LOOCV errors that result from fitting the following four models using least squares:\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "For model 1 , error is 8.292211622874765\n",
      "For model 2 , error is 1.01709580703398\n",
      "For model 3 , error is 1.0465534563296677\n",
      "For model 4 , error is 1.0574926712115134\n"
     ]
    }
   ],
   "source": [
    "#i. Y = β0 + β1X + eps\n",
    "#ii. Y = β0 + β1X + β2X2 +\n",
    "#iii. Y = β0 + β1X + β2X2 + β3X3 +\n",
    "#iv. Y = β0 + β1X + β2X2 + β3X3 + β4X4 + \n",
    "np.random.seed(1)\n",
    "for i in range(1,5):\n",
    "    poly = PolynomialFeatures(i,include_bias=False)\n",
    "    predictors = poly.fit_transform(X.reshape(-1,1))\n",
    "\n",
    "    lr = LinearRegression()\n",
    "    error = -1*cross_val_score(lr,predictors,y,cv = len(X),scoring = 'neg_mean_squared_error').mean()\n",
    "    print('For model {} , error is {}'.format(i,error))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Best error value if for degree = 2, and since its stimulated data, we know that the real relationship is also quadratic."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### (d) Repeat (c) using another random seed, and report your results. Are your results the same as what you got in (c)? Why?\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "For model 1 , error is 8.292211622874765\n",
      "For model 2 , error is 1.01709580703398\n",
      "For model 3 , error is 1.0465534563296677\n",
      "For model 4 , error is 1.0574926712115134\n"
     ]
    }
   ],
   "source": [
    "# repeting the same experiment, this time setting different seed\n",
    "np.random.seed(5)\n",
    "for i in range(1,5):\n",
    "    poly = PolynomialFeatures(i,include_bias=False)\n",
    "    predictors = poly.fit_transform(X.reshape(-1,1))\n",
    "\n",
    "    lr = LinearRegression()\n",
    "    error = -1*cross_val_score(lr,predictors,y,cv = len(X),scoring = 'neg_mean_squared_error').mean()\n",
    "    print('For model {} , error is {}'.format(i,error))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The results we got are absolutely the same, this is becaise there is no random sampling in LOOCV. Everytime, we fit n models\n",
    "# such that each time, model will be trained on n-1 observations, and than tested on a left out observations."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### (f) Comment on the statistical significance of the coefficient estimates that results from fitting each of the models in (c) using least squares. Do these results agree with the conclusions drawn based on the cross-validation results?\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                            OLS Regression Results                            \n",
      "==============================================================================\n",
      "Dep. Variable:                      y   R-squared:                       0.088\n",
      "Model:                            OLS   Adj. R-squared:                  0.079\n",
      "Method:                 Least Squares   F-statistic:                     9.460\n",
      "Date:                Sun, 12 Jul 2020   Prob (F-statistic):            0.00272\n",
      "Time:                        17:19:38   Log-Likelihood:                -242.69\n",
      "No. Observations:                 100   AIC:                             489.4\n",
      "Df Residuals:                      98   BIC:                             494.6\n",
      "Df Model:                           1                                         \n",
      "Covariance Type:            nonrobust                                         \n",
      "==============================================================================\n",
      "                 coef    std err          t      P>|t|      [0.025      0.975]\n",
      "------------------------------------------------------------------------------\n",
      "const         -1.7609      0.280     -6.278      0.000      -2.317      -1.204\n",
      "x1             0.9134      0.297      3.076      0.003       0.324       1.503\n",
      "==============================================================================\n",
      "Omnibus:                       40.887   Durbin-Watson:                   1.957\n",
      "Prob(Omnibus):                  0.000   Jarque-Bera (JB):               83.786\n",
      "Skew:                          -1.645   Prob(JB):                     6.40e-19\n",
      "Kurtosis:                       6.048   Cond. No.                         1.19\n",
      "==============================================================================\n",
      "\n",
      "Warnings:\n",
      "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
      "                            OLS Regression Results                            \n",
      "==============================================================================\n",
      "Dep. Variable:                      y   R-squared:                       0.882\n",
      "Model:                            OLS   Adj. R-squared:                  0.880\n",
      "Method:                 Least Squares   F-statistic:                     362.9\n",
      "Date:                Sun, 12 Jul 2020   Prob (F-statistic):           9.26e-46\n",
      "Time:                        17:19:38   Log-Likelihood:                -140.40\n",
      "No. Observations:                 100   AIC:                             286.8\n",
      "Df Residuals:                      97   BIC:                             294.6\n",
      "Df Model:                           2                                         \n",
      "Covariance Type:            nonrobust                                         \n",
      "==============================================================================\n",
      "                 coef    std err          t      P>|t|      [0.025      0.975]\n",
      "------------------------------------------------------------------------------\n",
      "const         -0.0216      0.122     -0.177      0.860      -0.264       0.221\n",
      "x1             1.2132      0.108     11.238      0.000       0.999       1.428\n",
      "x2            -2.0014      0.078    -25.561      0.000      -2.157      -1.846\n",
      "==============================================================================\n",
      "Omnibus:                        0.094   Durbin-Watson:                   2.221\n",
      "Prob(Omnibus):                  0.954   Jarque-Bera (JB):                0.009\n",
      "Skew:                          -0.022   Prob(JB):                        0.995\n",
      "Kurtosis:                       2.987   Cond. No.                         2.26\n",
      "==============================================================================\n",
      "\n",
      "Warnings:\n",
      "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
      "                            OLS Regression Results                            \n",
      "==============================================================================\n",
      "Dep. Variable:                      y   R-squared:                       0.883\n",
      "Model:                            OLS   Adj. R-squared:                  0.880\n",
      "Method:                 Least Squares   F-statistic:                     242.1\n",
      "Date:                Sun, 12 Jul 2020   Prob (F-statistic):           1.26e-44\n",
      "Time:                        17:19:38   Log-Likelihood:                -139.91\n",
      "No. Observations:                 100   AIC:                             287.8\n",
      "Df Residuals:                      96   BIC:                             298.2\n",
      "Df Model:                           3                                         \n",
      "Covariance Type:            nonrobust                                         \n",
      "==============================================================================\n",
      "                 coef    std err          t      P>|t|      [0.025      0.975]\n",
      "------------------------------------------------------------------------------\n",
      "const          0.0046      0.125      0.037      0.971      -0.244       0.253\n",
      "x1             1.0639      0.189      5.636      0.000       0.689       1.439\n",
      "x2            -2.0215      0.081    -24.938      0.000      -2.182      -1.861\n",
      "x3             0.0550      0.057      0.965      0.337      -0.058       0.168\n",
      "==============================================================================\n",
      "Omnibus:                        0.034   Durbin-Watson:                   2.253\n",
      "Prob(Omnibus):                  0.983   Jarque-Bera (JB):                0.050\n",
      "Skew:                           0.032   Prob(JB):                        0.975\n",
      "Kurtosis:                       2.911   Cond. No.                         6.55\n",
      "==============================================================================\n",
      "\n",
      "Warnings:\n",
      "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
      "                            OLS Regression Results                            \n",
      "==============================================================================\n",
      "Dep. Variable:                      y   R-squared:                       0.885\n",
      "Model:                            OLS   Adj. R-squared:                  0.880\n",
      "Method:                 Least Squares   F-statistic:                     182.4\n",
      "Date:                Sun, 12 Jul 2020   Prob (F-statistic):           1.13e-43\n",
      "Time:                        17:19:38   Log-Likelihood:                -139.24\n",
      "No. Observations:                 100   AIC:                             288.5\n",
      "Df Residuals:                      95   BIC:                             301.5\n",
      "Df Model:                           4                                         \n",
      "Covariance Type:            nonrobust                                         \n",
      "==============================================================================\n",
      "                 coef    std err          t      P>|t|      [0.025      0.975]\n",
      "------------------------------------------------------------------------------\n",
      "const          0.0866      0.144      0.600      0.550      -0.200       0.373\n",
      "x1             1.0834      0.189      5.724      0.000       0.708       1.459\n",
      "x2            -2.2455      0.214    -10.505      0.000      -2.670      -1.821\n",
      "x3             0.0436      0.058      0.755      0.452      -0.071       0.158\n",
      "x4             0.0482      0.043      1.132      0.260      -0.036       0.133\n",
      "==============================================================================\n",
      "Omnibus:                        0.102   Durbin-Watson:                   2.214\n",
      "Prob(Omnibus):                  0.950   Jarque-Bera (JB):                0.117\n",
      "Skew:                           0.069   Prob(JB):                        0.943\n",
      "Kurtosis:                       2.906   Cond. No.                         17.5\n",
      "==============================================================================\n",
      "\n",
      "Warnings:\n",
      "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n"
     ]
    }
   ],
   "source": [
    "for i in range(1,5):\n",
    "    # remember this time we are not setting include bias = False\n",
    "    poly = PolynomialFeatures(i)\n",
    "    predictors = poly.fit_transform(X.reshape(-1,1))\n",
    "\n",
    "    results = sm.OLS(y,predictors).fit()\n",
    "    print(results.summary())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "# from the above table we notice two things - \n",
    "# Firstly, value of adjusted R_square doesn;t increase after quadratic\n",
    "# Secondly, Only p values of X1 and X2 are significant\n",
    "# Hence the best model is quadratic model, which is what we got by observing the LOOCV values"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 9. We will now consider the Boston housing data set, from the MASS library."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(506, 14)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>CRIM</th>\n",
       "      <th>ZN</th>\n",
       "      <th>INDUS</th>\n",
       "      <th>CHAS</th>\n",
       "      <th>NOX</th>\n",
       "      <th>RM</th>\n",
       "      <th>AGE</th>\n",
       "      <th>DIS</th>\n",
       "      <th>RAD</th>\n",
       "      <th>TAX</th>\n",
       "      <th>PTRATIO</th>\n",
       "      <th>B</th>\n",
       "      <th>LSTAT</th>\n",
       "      <th>MEDV</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.00632</td>\n",
       "      <td>18.0</td>\n",
       "      <td>2.31</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.538</td>\n",
       "      <td>6.575</td>\n",
       "      <td>65.2</td>\n",
       "      <td>4.0900</td>\n",
       "      <td>1.0</td>\n",
       "      <td>296.0</td>\n",
       "      <td>15.3</td>\n",
       "      <td>396.90</td>\n",
       "      <td>4.98</td>\n",
       "      <td>24.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.02731</td>\n",
       "      <td>0.0</td>\n",
       "      <td>7.07</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.469</td>\n",
       "      <td>6.421</td>\n",
       "      <td>78.9</td>\n",
       "      <td>4.9671</td>\n",
       "      <td>2.0</td>\n",
       "      <td>242.0</td>\n",
       "      <td>17.8</td>\n",
       "      <td>396.90</td>\n",
       "      <td>9.14</td>\n",
       "      <td>21.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.02729</td>\n",
       "      <td>0.0</td>\n",
       "      <td>7.07</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.469</td>\n",
       "      <td>7.185</td>\n",
       "      <td>61.1</td>\n",
       "      <td>4.9671</td>\n",
       "      <td>2.0</td>\n",
       "      <td>242.0</td>\n",
       "      <td>17.8</td>\n",
       "      <td>392.83</td>\n",
       "      <td>4.03</td>\n",
       "      <td>34.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.03237</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.18</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.458</td>\n",
       "      <td>6.998</td>\n",
       "      <td>45.8</td>\n",
       "      <td>6.0622</td>\n",
       "      <td>3.0</td>\n",
       "      <td>222.0</td>\n",
       "      <td>18.7</td>\n",
       "      <td>394.63</td>\n",
       "      <td>2.94</td>\n",
       "      <td>33.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.06905</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.18</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.458</td>\n",
       "      <td>7.147</td>\n",
       "      <td>54.2</td>\n",
       "      <td>6.0622</td>\n",
       "      <td>3.0</td>\n",
       "      <td>222.0</td>\n",
       "      <td>18.7</td>\n",
       "      <td>396.90</td>\n",
       "      <td>5.33</td>\n",
       "      <td>36.2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \\\n",
       "0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  296.0   \n",
       "1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671  2.0  242.0   \n",
       "2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  242.0   \n",
       "3  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622  3.0  222.0   \n",
       "4  0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622  3.0  222.0   \n",
       "\n",
       "   PTRATIO       B  LSTAT  MEDV  \n",
       "0     15.3  396.90   4.98  24.0  \n",
       "1     17.8  396.90   9.14  21.6  \n",
       "2     17.8  392.83   4.03  34.7  \n",
       "3     18.7  394.63   2.94  33.4  \n",
       "4     18.7  396.90   5.33  36.2  "
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.datasets import load_boston\n",
    "boston = load_boston()\n",
    "data = pd.DataFrame(boston.data,columns=boston.feature_names)\n",
    "data['MEDV'] = boston.target\n",
    "print(data.shape)\n",
    "data.head()\n",
    "# by the way, we only need the medv columns, so, you could save yourself some lines of code, and just do \n",
    "# medv = boston.target"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### (a) Based on this data set, provide an estimate for the population mean of medv. Call this estimate ˆμ.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Estimated population mean is  22.532806324110698\n"
     ]
    }
   ],
   "source": [
    "print('Estimated population mean is ',np.mean(data['MEDV']))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### (b) Provide an estimate of the standard error of ˆμ. Interpret this result."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Estimation of standard error is  0.4084569346972867\n"
     ]
    }
   ],
   "source": [
    "print('Estimation of standard error is ',np.std(data['MEDV']) / np.sqrt(len(data)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### (c) Now estimate the standard error of ˆμ using the bootstrap. How does this compare to your answer from (b)?\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [],
   "source": [
    "def boot(data,R):\n",
    "    medv = []\n",
    "    num_samples = 500\n",
    "    for i in range(R):\n",
    "        indices = np.random.choice(data.index, num_samples, replace=True)\n",
    "        medv.append(data['MEDV'].loc[indices].mean())\n",
    "    bootstrap_statistics = {'estimated_value':np.mean(medv),'std_dev':np.std(medv)}   \n",
    "    return bootstrap_statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrap results  {'estimated_value': 22.517861600000003, 'std_dev': 0.40858473447430715}\n"
     ]
    }
   ],
   "source": [
    "result = boot(data,1000)\n",
    "print('Bootstrap results ',result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [],
   "source": [
    "# the value of standad error and mean is pretty much the same as we obtained eralier"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  d) Based on your bootstrap estimate from (c), provide a 95 % confidence interval for the mean of medv."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "With 95% confidence we can say that intercal is [21.70069213105139,23.335031068948616]\n"
     ]
    }
   ],
   "source": [
    "mean = result['estimated_value']\n",
    "std = result['std_dev']\n",
    "\n",
    "print('With 95% confidence we can say that intercal is [{},{}]'.format(mean - 2*std,mean+ 2*std))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### (e) Based on this data set, provide an estimate, ˆμmed, for the median value of medv in the population\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Estimated value median is  21.2\n"
     ]
    }
   ],
   "source": [
    "print('Estimated value median is ',np.median(data['MEDV']))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### (f) We now would like to estimate the standard error of ˆμmed. Unfortunately, there is no simple formula for computing the standard error of the median. Instead, estimate the standard error of the median using the bootstrap. Comment on your findings.\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [],
   "source": [
    "def boot(data,R):\n",
    "    median = []\n",
    "    num_samples = 500\n",
    "    for i in range(R):\n",
    "        indices = np.random.choice(data.index, num_samples, replace=True)\n",
    "        median.append(data['MEDV'].loc[indices].median())\n",
    "    bootstrap_statistics = {'estimated_value':np.mean(median),'std_dev':np.std(median)}   \n",
    "    return bootstrap_statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Boostrap Results for median are -  {'estimated_value': 21.176199999999998, 'std_dev': 0.3907730287519851}\n"
     ]
    }
   ],
   "source": [
    "result = boot(data,1000)\n",
    "print('Boostrap Results for median are - ',result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [],
   "source": [
    "# we can see that the mean we calculated earlier is very similar to the estimated value"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### (g) Based on this data set, provide an estimate for the tenth percentile of medv in Boston suburbs."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Estimate for Quantile  -  12.75\n"
     ]
    }
   ],
   "source": [
    "print('Estimate for Quantile  - ',data['MEDV'].quantile(0.1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Boostrap Results for median are -  {'estimated_value': 12.7596, 'std_dev': 0.5041569596861675}\n"
     ]
    }
   ],
   "source": [
    "def boot(data,R):\n",
    "    ten_quantiles = []\n",
    "    num_samples = 500\n",
    "    for i in range(R):\n",
    "        indices = np.random.choice(data.index, num_samples, replace=True)\n",
    "        ten_quantiles.append(data['MEDV'].loc[indices].quantile(0.1))\n",
    "    bootstrap_statistics = {'estimated_value':np.mean(ten_quantiles),'std_dev':np.std(ten_quantiles)}   \n",
    "    return bootstrap_statistics\n",
    "\n",
    "result = boot(data,1000)\n",
    "print('Boostrap Results for median are - ',result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
